Introduction and objective
The main objective of the present work is to analyze the World Happiness Report Index 2019 in order to gain insights into the following question:
Does money buy happiness?
For this, we will be working on various datasets.
Our main dataset will be the data downloaded from the World Happiness Report. The second data set that we are going to work with is the population dataset downloaded from the open repository of the World Bank for the period: 1960-2018. In addition, we will be considering a dataset used in class, that uses the happiness 2018 index and has a column detailing the regions of the countries in our previous datasets. And finally, we will be downloading the world map, to construct the base layer of our world maps to present the happiness information.
library(gifski)
library(ggridges)
library(PerformanceAnalytics)
library(corrplot)
library(plotly)
library(ggpubr)
library(MASS)
library(reshape)
library(readxl)
library(ggplot2)
library(dplyr)
library(RColorBrewer)
library(dplyr)
library(maps)
library(stringr)
library(tidyverse)
library(gganimate)
library(gifski)
library(png)
library(streamgraph)

# NOTE(review): machine-specific working directory. Relative outputs of this
# script (e.g. the rendered GIF) are written here.
setwd("C:/Users/user/Desktop/classes/sem3/avd_R/Advanced_Visualization_in_R")

# Loading the first dataframe: population from the World Bank, 1960-2018.
# The first two rows of the sheet are skipped (skip = 2).
population <- read_excel("C:/Users/user/Desktop/classes/sem3/avd_R/project/melt.xls", skip = 2)
# Keep the converted object; the conversion result was previously discarded.
population <- as.data.frame(population)

# Melting the population dataframe to get panel data: one row per
# country/indicator and year column.
population2 <- reshape2::melt(
  population,
  id.vars = c("Country Name", "Country Code", "Indicator Name", "Indicator Code")
)

# Loading the second dataframe, which is our main dataframe: information was
# downloaded from the World Happiness Report, 2018
data <- read_excel("C:/Users/user/Desktop/classes/sem3/avd_R/project/Chapter2OnlineData.xls")

# Loading the third dataframe: happy, that links countries with regions
happy <- read.csv("C:/Users/user/Desktop/classes/sem3/visual_R/Class_04/AdvancedVisualizationR-2019/Dane/happy2015.csv")
# Harmonise the column names used as join keys across the three dataframes,
# then sort the population and region tables by country name.
names(population2) <- replace(
  names(population2), names(population2) == "Country Name", "Country_name"
)
names(population2) <- replace(
  names(population2), names(population2) == "variable", "Year"
)
names(population2) <- replace(
  names(population2), names(population2) == "value", "Population"
)
population2 <- population2[order(population2[["Country_name"]]), ]

names(data) <- replace(names(data), names(data) == "Country name", "Country_name")

names(happy) <- replace(names(happy), names(happy) == "Country", "Country_name")
happy <- happy[order(happy[["Country_name"]]), ]

# Country names present in the happiness data but absent from the population
# data; these are the spellings that need to be reconciled next.
setdiff(data[["Country_name"]], population2[["Country_name"]])
# Align the World Bank country spellings in population2 with the names used in
# the happiness data. The lookup maps "World Bank name" -> "happiness name" and
# is spliced into recode() as individual old = new pairs.
population2$Country_name <- recode(
  population2$Country_name,
  !!!c(
    "Russian Federation"   = "Russia",
    "Iran, Islamic Rep."   = "Iran",
    "Egypt, Arab Rep."     = "Egypt",
    "Gambia, The"          = "Gambia",
    "Kyrgyz Republic"      = "Kyrgyzstan",
    "Lao PDR"              = "Laos",
    "Slovak Republic"      = "Slovakia",
    "Syrian Arab Republic" = "Syria",
    "Yemen, Rep."          = "Yemen",
    "Venezuela, RB"        = "Venezuela",
    "Cote d'Ivoire"        = "Ivory Coast",
    "Hong Kong SAR, China" = "Hong Kong S.A.R. of China",
    "Congo, Rep."          = "Congo (Brazzaville)",
    "Congo, Dem. Rep."     = "Congo (Kinshasa)",
    "Korea, Rep."          = "South Korea",
    "North Macedonia"      = "Macedonia",
    # NOTE(review): the next two entries relabel a whole country as one of its
    # regions, so that region inherits the country's population - confirm.
    "Cyprus"               = "North Cyprus",
    "Somalia"              = "Somaliland region"
  )
)
# Merge the main dataframe and population2 by country name and year
total <- merge(data, population2, by = c("Country_name", "Year"))

# Drop the columns that we do not need (selected by position with subset()).
# NOTE(review): the indices depend on the column order of both inputs; confirm
# them whenever the source files change.
total <- subset(total, select = -c(17, 20:26, 28, 29))

# Country names in the merged data that have no match in the region table
setdiff(total$Country_name, happy$Country_name)
happy$Country_name <- recode(
  happy$Country_name,
  "Hong Kong" = "Hong Kong S.A.R. of China",
  "Sudan" = "South Sudan"
)

# Keep only the first two columns of "happy" (country name and region).
# R indexing is 1-based, so the previous 0:2 silently dropped the 0.
regions <- happy[1:2]

# Merge total and regions dataframes by country name
total <- merge(total, regions, by = "Country_name")
# Loading the world map polygons
world <- map_data("world")

# "region" holds the country name, so rename it to match the other dataframes
names(world)[names(world) == "region"] <- "Country_name"

# Country names in total that have no match in the world map
setdiff(total$Country_name, world$Country_name)

# Rename the map's countries to the spellings used in total
world$Country_name <- recode(
  world$Country_name,
  "USA" = "United States",
  "UK" = "United Kingdom",
  "Cyprus" = "North Cyprus",
  "Republic of Congo" = "Congo (Brazzaville)",
  "Democratic Republic of the Congo" = "Congo (Kinshasa)",
  "Hong Kong" = "Hong Kong S.A.R. of China",
  "Somalia" = "Somaliland region",
  "Trinidad" = "Trinidad and Tobago"
)
Happiest regions in the World, 2005-2018
# Visualizing the happiest regions over the complete time scope:
# mean Life Ladder score per region across all available years.
table_1 <- total %>%
  group_by(Region) %>%
  summarise(Mean.response = mean(`Life Ladder`))

# Sort the regions from the highest to the lowest mean score
ordered_table_1 <- table_1[order(-(table_1$Mean.response)), ]
ordered_table_1
## # A tibble: 10 x 2
## Region Mean.response
## <fct> <dbl>
## 1 Australia and New Zealand 7.31
## 2 North America 7.27
## 3 Western Europe 6.84
## 4 Latin America and Caribbean 6.01
## 5 Eastern Asia 5.47
## 6 Middle East and Northern Africa 5.45
## 7 Southeastern Asia 5.34
## 8 Central and Eastern Europe 5.31
## 9 Southern Asia 4.59
## 10 Sub-Saharan Africa 4.25
The present table describes the happiest regions considering the complete available time scope.
Some insights: in the period 2005-2018, the happiest region was Australia and New Zealand with a happiness score of 7.31, followed by North America with a happiness score of 7.27 and Western Europe with a score of 6.84. From this table we can derive our first educated guess: wealthier regions are on average happier than poorer ones. We can see this effect clearly if we look at the unhappiest region, Sub-Saharan Africa, with a score lower than half of the index (4.25), followed by Southern Asia (4.59).
- Note: the means are in descending order
World Map happiness index heatmap, 2017
# From "total" keep only the 2017 rows and the columns needed for the map
# (stored as happy_2017), then attach them to the "world" polygons.
happy_2017 <- subset(total, Year == 2017) %>%
  dplyr::select(
    Country_name, `Life Ladder`, `Country Code`, Region, Year, Population
  )
happySubset_17 <- inner_join(world, happy_2017, by = "Country_name")

# One label position per country: the mean longitude and latitude of all of
# the country's polygon vertices.
test <- summarise(
  group_by(happySubset_17, Country_name),
  mean_long = mean(long),
  mean_lat = mean(lat)
)
happySubset_17test <- merge(happySubset_17, test, by = "Country_name")
# Plotting
# Minimal map theme: no axes, grid or border, white background, centred title.
plain <- theme(
  axis.text = element_blank(),
  axis.line = element_blank(),
  axis.ticks = element_blank(),
  panel.border = element_blank(),
  panel.grid = element_blank(),
  axis.title = element_blank(),
  panel.background = element_rect(fill = "white"),
  plot.title = element_text(hjust = 0.5)
)

# Polygons of countries to draw without a 2017 score (they get an NA fill).
# %in% tests set membership; `==` against a vector recycles it position by
# position and keeps only accidental matches. Countries that do have a 2017
# score are excluded so their polygons are not added twice.
missing_countries <- world %>%
  filter(
    Country_name %in% c(
      "Sudan", "Angola", "Namibia", "Cuba", "Taiwan", "Eritrea", "Gambia",
      "Guam", "Malaysia", "Oman", "Qatar", "Swaziland", "Syria"
    ),
    !Country_name %in% happySubset_17$Country_name
  )
all_countries <- bind_rows(happySubset_17, missing_countries)

ggplot(data = all_countries, mapping = aes(x = long, y = lat, group = group)) +
  geom_polygon(aes(fill = `Life Ladder`), colour = "black") +
  scale_fill_distiller(palette = "YlOrRd", direction = -1) +
  ggtitle("Life Ladder Index (LLI) for 2017") +
  plain +
  # Country labels at the mean coordinates; overlapping labels are dropped
  geom_text(
    data = happySubset_17test,
    aes(x = mean_long, y = mean_lat, label = Country_name),
    color = "black", check_overlap = TRUE, size = 3, hjust = 0,
    inherit.aes = FALSE
  )
The 2017-happiness subset reflects similar behaviour to the table of the happiest regions in the world. In this sense, we can see that the happiest countries are comprised in the regions displaying the full time frame, but for 2017. The absolute happiest country is Finland with a happiness score of 7.78, followed by Denmark with a score of 7.59. On the third place comes very close Norway with a score of 7.57. Similarly, the bottom 3 places were held by Afghanistan (2.66), South Sudan (2.81) and Rwanda (3.10).
- Note: the countries displayed in grey correspond to those where the life ladder is not available for the year of 2017
Happiness per selected countries: Poland, Ecuador, Azerbaijan for all years
# All available years for the three selected countries
ecuador_all <- total[total$Country_name == "Ecuador", ]
poland_all <- total[total$Country_name == "Poland", ]
azerbaijan_all <- total[total$Country_name == "Azerbaijan", ]
countries_allyears <- rbind(ecuador_all, poland_all, azerbaijan_all)

# plot separately: one facet per country.
# Columns are referenced by bare name inside aes(); df$col vectors are not
# split along with the data when facetting.
ggplot(countries_allyears, aes(x = Year, y = `Life Ladder`, fill = Country_name)) +
  geom_bar(
    width = 0.6, position = position_dodge(width = 0.5),
    stat = "identity", colour = "black"
  ) +
  facet_wrap(~Country_name, scales = "free_y", ncol = 3) +
  theme_bw() +
  theme(strip.text = element_text(size = 15, face = "bold")) +
  theme(legend.position = "none") +
  # theme(panel.grid.major = element_line(colour = "black", size = 0.2)) +
  # theme(panel.grid.minor = element_line(colour = "black", size = 0.2)) +
  theme(axis.text.x = element_text(angle = 30, hjust = 1, vjust = 0.5, size = 12)) +
  labs(x = "Years", y = "Happiness evolution per selected countries")
Happiness per selected countries: Poland, Ecuador, Azerbaijan for 2017
# 2017 rows for the three selected countries
ecuador <- happySubset_17[happySubset_17$Country_name == "Ecuador", ]
poland <- happySubset_17[happySubset_17$Country_name == "Poland", ]
azerbaijan <- happySubset_17[happySubset_17$Country_name == "Azerbaijan", ]
countries <- rbind(ecuador, poland, azerbaijan)
# Year as text so that it is treated as a discrete axis
countries$Year <- as.character(countries$Year)

# happySubset_17 carries one row per map vertex, so the data is collapsed to
# one row per country before drawing; otherwise the same bar is drawn once per
# vertex. Columns are referenced by bare name inside aes().
ggplot(
  distinct(countries, Country_name, Year, `Life Ladder`),
  aes(x = Year, y = `Life Ladder`, fill = Country_name)
) +
  geom_bar(
    width = 0.6, position = position_dodge(width = 0.8),
    stat = "identity", colour = "black"
  ) +
  labs(
    x = "Years",
    y = "Happiness index per selected countries in the year of 2017",
    fill = "Countries"
  ) +
  coord_flip()
On the micro-level, we have selected 3 countries to present to the class, being these: Azerbaijan, Ecuador and Poland. The first 2 as they represent the origin countries of the authors, and the third one to be able to analyze the current country of residence. From this horizontal bar chart, we can observe that the happiest country among the 3 in 2017 is Poland, followed by Ecuador and Azerbaijan. Although the structural differences among the countries are very different, the happiness scores are not that dispersed, being these: Poland 6.20, Ecuador 5.83, and Azerbaijan 5.15. If we consider the inequality and log income per capita we see that the same relationship of the order holds, being the wealthiest: Poland with a log GDP per capita of 10.211, a GINI(N/A), Azerbaijan with a log GDP of 9.670 and a Gini of 0.21 and Ecuador with a log GDP per capita of 9.266, and a GINI of 0.496. Even though there are major differences among happiness index and log GDP there are differences in Gini coefficient where Ecuador has more than twice as much inequality as Azerbaijan.
Distribution of the log of the income GDP per capita and inequality, 2017
# 2017 rows with the income, generosity and inequality columns
gdp_2017 <- subset(total, Year == 2017)
gdp_2017 <- gdp_2017 %>%
  dplyr::select(
    "Country_name", "Life Ladder", "Country Code", "Region", "Year",
    "Population", "Log GDP per capita", "Generosity",
    "GINI index (World Bank estimate), average 2000-16"
  )

# Ridge plot of log GDP per capita by region.
# The x-axis label now names the variable actually mapped to x (the labels of
# the two plots were swapped).
one <- ggplot(gdp_2017, aes(x = `Log GDP per capita`, y = Region, fill = Region)) +
  geom_density_ridges() +
  theme_ridges() +
  labs(
    title = "Distribution of the log of the income GDP per capita and Inequality, 2017",
    x = "Log GDP per capita"
  ) +
  theme(legend.position = "none")

# Ridge plot of the GINI index by region
two <- ggplot(
  gdp_2017,
  aes(x = `GINI index (World Bank estimate), average 2000-16`, y = Region, fill = Region)
) +
  geom_density_ridges() +
  theme_ridges() +
  labs(x = "GINI Index") +
  theme(legend.position = "none")
# Loading the function multiplot

#' Draw several plots on one page using a grid layout
#'
#' @param ... Plot objects to draw (anything whose `print()` method accepts a
#'   `vp` viewport argument, e.g. ggplot objects).
#' @param plotlist Optional list of further plot objects, appended after `...`.
#' @param file Unused; kept so existing calls that pass it keep working.
#' @param cols Number of columns in the automatically generated layout.
#' @param layout Optional integer matrix giving the cell(s) of each plot: plot
#'   `i` is drawn in the cells whose value is `i`. When supplied, `cols` is
#'   ignored.
#' @return Called for its side effect of drawing. Returns `NULL` invisibly when
#'   no plots are supplied.
multiplot <- function(..., plotlist = NULL, file, cols = 1, layout = NULL) {
  plots <- c(list(...), plotlist)
  num_plots <- length(plots)

  # Nothing to draw: return before touching the graphics device
  # (1:0 would otherwise iterate over c(1, 0) and fail on plots[[1]]).
  if (num_plots == 0) {
    return(invisible(NULL))
  }

  # Default layout: fill a cols-wide matrix column by column with plot indices
  if (is.null(layout)) {
    n_rows <- ceiling(num_plots / cols)
    layout <- matrix(seq(1, cols * n_rows), ncol = cols, nrow = n_rows)
  }

  if (num_plots == 1) {
    print(plots[[1]])
  } else {
    grid::grid.newpage()
    grid::pushViewport(
      grid::viewport(layout = grid::grid.layout(nrow(layout), ncol(layout)))
    )
    for (i in seq_len(num_plots)) {
      # Row/column positions of every layout cell assigned to plot i
      cells <- as.data.frame(which(layout == i, arr.ind = TRUE))
      print(
        plots[[i]],
        vp = grid::viewport(
          layout.pos.row = cells$row,
          layout.pos.col = cells$col
        )
      )
    }
  }
}
# Stack the two ridge plots: GINI index on top, log GDP per capita below
multiplot(two, one)
The 2017 country distribution displayed by region and their correspondent log GDP per capita displays the inner wealth distribution of the countries. In this graph we can see that Western Europe is by far the wealthiest region with a distribution that resembles a bell curve. In contrast, Sub-Saharan Africa presents a distribution closer to the axis which translates into lower log GDP per capita. Plus, we can see that especially the distribution of the countries in this region is skewed to the right which denotes inequality in the distribution. As we account also for Gini coefficient information we will be also presenting this relation.
Similarly to the log income distribution per capita, the Gini coefficient shows that the most equal regions in 2017 are Central and Eastern Europe, followed by Western Europe. Conversely, the most unequal regions are Sub-Saharan Africa, and Latin America and the Caribbean. It is important to mention that these distributions show the inequality levels in each country, which is why we can observe in Sub-Saharan Africa a smaller distribution on the right side of the axis, denoting a much higher level of inequality; it represents the countries of Botswana with a Gini index of 0.62, followed by South Africa (0.62).
From the condensed graph, we can say that Sub-Saharan Africa is a region with both: low log income per capita and high Gini inequality index.
Streamgraph of Generosity Perception
# Columns needed for the generosity streamgraph, selected by name instead of
# by position so the selection survives changes in column order.
generosity <- total[, c("Country_name", "Year", "Generosity", "Region")]

# Mean generosity per region and year (missing scores are ignored)
generosity_mean <- generosity %>%
  group_by(Region, Year) %>%
  summarise(Mean.response = mean(Generosity, na.rm = TRUE)) %>%
  ungroup()

# One stream per region over the years
generosity_mean %>%
  streamgraph("Region", "Mean.response", "Year") %>%
  sg_axis_x(1, "year", "%Y") %>%
  sg_legend(TRUE, "Region ")
From the graph we can get insights regarding the generosity scores in the region for the time scope, being the most generous regions on average: Australia and New Zealand, followed by South Eastern Asia, and North America.
Linear regression approximation between Happiness Index and Log GDP Per Capita and GINI
# Happiness vs. inequality.
# The GINI column is referenced by its full name; the shortened name only
# resolved through `$` partial matching on a data.frame.
# NOTE(review): the abline coefficients are hard-coded, presumably from a
# regression fitted outside this section - confirm they match the data.
three <- ggplot(
  data = gdp_2017,
  aes(
    x = `GINI index (World Bank estimate), average 2000-16`,
    y = `Life Ladder`, colour = Region, size = Population
  )
) +
  geom_point() +
  labs(title = "Linear regression approximation between Happiness Index and Log GDP Per Capita and GINI
", y = "Happiness Index", x = "GINI") +
  geom_abline(intercept = 7.045, slope = -4.169) +
  guides(size = "none")

# Happiness vs. log GDP per capita
four <- ggplot(
  data = gdp_2017,
  aes(x = `Log GDP per capita`, y = `Life Ladder`, colour = Region, size = Population)
) +
  geom_point() +
  labs(y = "Happiness Index", x = "Log GDP per capita") +
  geom_abline(intercept = -1.1414, slope = 0.7145) +
  guides(size = "none")

# Panel A (GINI) above panel B (log GDP) with one shared region legend
ggarrange(three, four,
  labels = c("A", "B"),
  ncol = 1, nrow = 2, common.legend = TRUE, legend = "right"
)
From the regression results, we can interpret that the Gini inequality index has a strong significant negative relationship with the happiness index, as a one-point increase in inequality is associated with a decrease of 4.16 points in the happiness index score.
From the regression results for the log of GDP per capita and its relationship with the happiness index score, we can see that the inner wealth of a country has a strong significant positive relationship with happiness: a 1% increase in GDP per capita is associated with an increase of about 0.0071 points in the happiness index score.
- Note: The size of the points corresponds to the population of each country
total2 <- total

# Rank countries by Life Ladder within each year and keep the yearly top 10
happiness_ranked <- total2 %>%
  group_by(Year) %>%
  mutate(
    # rank 1 = highest Life Ladder of the year
    rank = rank(-`Life Ladder`),
    # Score relative to the year's best score. max() is used instead of
    # `Life Ladder`[rank == 1], which is empty when the top score is tied.
    Value_rel = `Life Ladder` / max(`Life Ladder`, na.rm = TRUE),
    Value_lbl = paste0(" ", round(`Life Ladder`, 2))
  ) %>%
  ungroup() %>%
  filter(rank <= 10)

happiness_ranked_2018 <- happiness_ranked %>% filter(Year == 2018)
Animated bar chart race for the Happiness Index by Top 10 countries, 2005- 2018
# Static version of the bar chart race: one tile per country and rank, drawn
# as a horizontal bar from 0 to the country's Life Ladder score.
staticplot <- ggplot(
  happiness_ranked,
  aes(rank,
    group = Country_name,
    fill = as.factor(Country_name), color = as.factor(Country_name)
  )
) +
  # Tiles are centred at half the score with height = score, so they span 0..score
  geom_tile(
    aes(y = `Life Ladder` / 2, height = `Life Ladder`, width = 0.9),
    alpha = 0.8, color = "black"
  ) +
  # Country name, right-aligned at the middle of the bar
  geom_text(
    aes(y = `Life Ladder` / 2, label = paste(Country_name, " ")),
    size = 12, vjust = 0.2, hjust = 1, colour = "black"
  ) +
  # Score label at the end of the bar
  geom_text(
    aes(y = `Life Ladder`, label = Value_lbl, hjust = 0),
    size = 8, colour = "black"
  ) +
  coord_flip(clip = "off", expand = FALSE) +
  scale_y_continuous(labels = scales::comma) +
  # Rank 1 at the top
  scale_x_reverse() +
  # "none" replaces the deprecated FALSE for switching guides off
  guides(color = "none", fill = "none") +
  theme(
    axis.line = element_blank(),
    axis.text.x = element_blank(),
    axis.text.y = element_blank(),
    axis.ticks = element_blank(),
    axis.title.x = element_blank(),
    axis.title.y = element_blank(),
    legend.position = "none",
    panel.background = element_blank(),
    panel.border = element_blank(),
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    panel.grid.major.x = element_line(size = .1, color = "grey"),
    panel.grid.minor.x = element_line(size = .1, color = "grey"),
    plot.title = element_text(size = 25, hjust = 0.5, face = "bold", colour = "black", vjust = -1),
    plot.subtitle = element_text(size = 16, hjust = 0.5, face = "italic", color = "grey"),
    plot.caption = element_text(size = 18, hjust = 0.5, face = "italic", color = "grey"),
    plot.background = element_blank(),
    plot.margin = margin(2, 2, 2, 4, "cm")
  )
# Animation ----
# One state per year; the y view follows the data while the rank axis stays
# fixed. {closest_state} in the title is filled in with the current year.
race_labels <- labs(
  title = 'Happiness Index per Year : {closest_state}',
  subtitle = "Top 10 Countries",
  caption = "Happiness Index per Year | Data Source: Happiness World Report 2019"
)
anim <- staticplot +
  transition_states(Year, transition_length = 4, state_length = 1) +
  view_follow(fixed_x = TRUE) +
  race_labels

# Render 300 frames at 15 fps into gganim.gif (written to the working directory)
animate(
  anim,
  nframes = 300,
  fps = 15,
  width = 1200,
  height = 1000,
  renderer = gifski_renderer("gganim.gif")
)